In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os, re, json, time
from matplotlib_venn import venn2, venn3
from scipy import stats
# NOTE(review): wildcard import — presumably this supplies DATE,
# make_sankey_plot_with_counts, cohenD and autosave used below (confirm);
# explicit imports would make those dependencies visible.
from utility_functions import *
# Display the date stamp used as a prefix for all output filenames.
DATE
Out[1]:
'20250630'
Sankey plot¶
In [2]:
from pySankey.sankey import sankey as sankeyplot
import plotly.graph_objects as go
In [3]:
# Config for this run.
# NOTE(review): absolute local path — consider making this configurable
# (e.g. environment variable) so the notebook runs on other machines.
working_folder = "C:/Users/Enrico/OneDrive - UGent/run-ionbot"
filtering = 'global'
# filtering = 'hybrid'

DATASETS = ["PXD014258.v0.11.4", "PXD002057.v0.11.4", "PXD005833.v0.11.4"]

# Load the outer-joined closed-search x open-search tables, one per dataset.
# (Was two loops with `_` as loop variable, which shadows IPython's
# last-output `_`; merged into one loop with a proper name.)
data = []
for dataset_name in DATASETS:
    dataset_df = pd.read_csv(os.path.join(
        working_folder, dataset_name,
        f'{dataset_name[:9]}-opensearch-x-closedsearch-filt-{filtering}-outerjoin.csv.gz'))
    print(dataset_df.shape)  # per-dataset (rows, cols)
    data.append(dataset_df)
data = pd.concat(data, ignore_index=True)
print(data.shape)  # combined table

# Sankey of identification-state transitions (closed search -> open search).
# `counts` holds the underlying transition matrix and is reused in later cells.
F, counts = make_sankey_plot_with_counts(data, suffixes=['_closedsearch', '_opensearch'])
F.write_image(f"publication-data/{DATE}-Sankey-closedsearch-opensearch-{filtering}-filtering.svg")
F.show()
(128800, 61) (46757, 61) (165822, 61) (341379, 61)
In [4]:
# Overlap of spectra that received an assignment (non-NaN database column)
# in each search, pooled over all datasets.
closed_search_identified_spectra = data[~data.database_closedsearch.isna()].spectrum_title
open_search_identified_spectra = data[~data.database_opensearch.isna()].spectrum_title
# (Removed a dead mid-cell expression `open_search_identified_spectra` —
# not the last line of the cell, so its repr was never displayed.)
venn2([set(closed_search_identified_spectra), set(open_search_identified_spectra)],
      set_labels=['Closed search', 'Open search'],
      set_colors=sns.color_palette("Set2")[:2])
plt.title('Identified spectra overlap (all datasets)')
plt.savefig(f"publication-data/{DATE}-overall-overlap-closedsearch-opensearch-{filtering}-filtering.svg")
In [5]:
# Ratio of unique spectra identified: open search vs. closed search.
n_open = len(set(open_search_identified_spectra))
n_closed = len(set(closed_search_identified_spectra))
n_open / n_closed
Out[5]:
1.597610958579414
In [6]:
# Superseded by the Sankey cell above; kept for reference only.
# NOTE: if revived, the path needs an f-string prefix for {DATE} to interpolate:
# F = make_sankey_plot_with_counts(data, sankey_labels)
# F.write_image(f"publication-data/{DATE}-Sankey-plot-closedsearch-opensearch.svg")
# F.show()
In [7]:
# Transition matrix: closed-search state (rows) x open-search state (columns).
row_states = ['Canonical+Unmodified/Expected', 'NonCanonical+Unmodified/Expected',
              'Decoy', 'Unidentified']
col_states = ['Canonical+Unmodified/Expected', 'Canonical+Unexpected',
              'NonCanonical+Unmodified/Expected', 'NonCanonical+Unexpected',
              'Decoy', 'Unidentified']
data3 = counts.loc[row_states, col_states]
data3.style.background_gradient()
Out[7]:
| sankey_opensearch | Canonical+Unmodified/Expected | Canonical+Unexpected | NonCanonical+Unmodified/Expected | NonCanonical+Unexpected | Decoy | Unidentified |
|---|---|---|---|---|---|---|
| sankey_closedsearch | ||||||
| Canonical+Unmodified/Expected | 161808 | 17531 | 520 | 1367 | 712 | 16104 |
| NonCanonical+Unmodified/Expected | 236 | 605 | 674 | 185 | 33 | 594 |
| Decoy | 254 | 482 | 24 | 48 | 179 | 1152 |
| Unidentified | 67965 | 62394 | 2682 | 3294 | 2536 | 0 |
In [8]:
# All spectra
tmp = data3.iloc[:,:]
print(tmp.sum().sum())
tmp
341379
Out[8]:
| sankey_opensearch | Canonical+Unmodified/Expected | Canonical+Unexpected | NonCanonical+Unmodified/Expected | NonCanonical+Unexpected | Decoy | Unidentified |
|---|---|---|---|---|---|---|
| sankey_closedsearch | ||||||
| Canonical+Unmodified/Expected | 161808 | 17531 | 520 | 1367 | 712 | 16104 |
| NonCanonical+Unmodified/Expected | 236 | 605 | 674 | 185 | 33 | 594 |
| Decoy | 254 | 482 | 24 | 48 | 179 | 1152 |
| Unidentified | 67965 | 62394 | 2682 | 3294 | 2536 | 0 |
In [9]:
# Any --> Any
tmp = data3.iloc[:-1,:-1]
print(tmp.sum().sum())
print(f"{tmp.sum().sum() / data3.sum().sum():.1%}")
tmp
184658 54.1%
Out[9]:
| sankey_opensearch | Canonical+Unmodified/Expected | Canonical+Unexpected | NonCanonical+Unmodified/Expected | NonCanonical+Unexpected | Decoy |
|---|---|---|---|---|---|
| sankey_closedsearch | |||||
| Canonical+Unmodified/Expected | 161808 | 17531 | 520 | 1367 | 712 |
| NonCanonical+Unmodified/Expected | 236 | 605 | 674 | 185 | 33 |
| Decoy | 254 | 482 | 24 | 48 | 179 |
In [10]:
# Any --> Any
tmp = data3.iloc[:,:-1]
print(tmp.sum().sum())
print(f"{tmp.sum().sum() / data3.sum().sum():.1%}")
tmp
323529 94.8%
Out[10]:
| sankey_opensearch | Canonical+Unmodified/Expected | Canonical+Unexpected | NonCanonical+Unmodified/Expected | NonCanonical+Unexpected | Decoy |
|---|---|---|---|---|---|
| sankey_closedsearch | |||||
| Canonical+Unmodified/Expected | 161808 | 17531 | 520 | 1367 | 712 |
| NonCanonical+Unmodified/Expected | 236 | 605 | 674 | 185 | 33 |
| Decoy | 254 | 482 | 24 | 48 | 179 |
| Unidentified | 67965 | 62394 | 2682 | 3294 | 2536 |
In [11]:
# Identified by closed search
tmp = data3.iloc[:-1,:]
print(tmp.sum().sum())
print(f"{tmp.sum().sum() / data3.sum().sum():.1%}")
tmp
202508 59.3%
Out[11]:
| sankey_opensearch | Canonical+Unmodified/Expected | Canonical+Unexpected | NonCanonical+Unmodified/Expected | NonCanonical+Unexpected | Decoy | Unidentified |
|---|---|---|---|---|---|---|
| sankey_closedsearch | ||||||
| Canonical+Unmodified/Expected | 161808 | 17531 | 520 | 1367 | 712 | 16104 |
| NonCanonical+Unmodified/Expected | 236 | 605 | 674 | 185 | 33 | 594 |
| Decoy | 254 | 482 | 24 | 48 | 179 | 1152 |
In [12]:
# Expected --> Expected ptm
tmp = data3.iloc[:2,[0,2]]
print(tmp.sum().sum())
print(f"{tmp.sum().sum() / data3.sum().sum():.1%}")
tmp
163238 47.8%
Out[12]:
| sankey_opensearch | Canonical+Unmodified/Expected | NonCanonical+Unmodified/Expected |
|---|---|---|
| sankey_closedsearch | ||
| Canonical+Unmodified/Expected | 161808 | 520 |
| NonCanonical+Unmodified/Expected | 236 | 674 |
In [13]:
# Expected --> Unexpected ptm
tmp = data3.iloc[:2,[1,3]]
print(tmp.sum().sum())
print(f"{tmp.sum().sum() / data3.sum().sum():.1%}")
tmp
19688 5.8%
Out[13]:
| sankey_opensearch | Canonical+Unexpected | NonCanonical+Unexpected |
|---|---|---|
| sankey_closedsearch | ||
| Canonical+Unmodified/Expected | 17531 | 1367 |
| NonCanonical+Unmodified/Expected | 605 | 185 |
In [14]:
# Unidentified --> Any ID
tmp = data3.iloc[-1,:-2]
print(tmp.sum().sum())
round(tmp / tmp.sum(),2)
136335
Out[14]:
sankey_opensearch Canonical+Unmodified/Expected 0.50 Canonical+Unexpected 0.46 NonCanonical+Unmodified/Expected 0.02 NonCanonical+Unexpected 0.02 Name: Unidentified, dtype: float64
In [15]:
# NonCanon --> Any
tmp = data3.iloc[1,:]
print(tmp.sum().sum())
round(tmp / tmp.sum(),2)
2327
Out[15]:
sankey_opensearch Canonical+Unmodified/Expected 0.10 Canonical+Unexpected 0.26 NonCanonical+Unmodified/Expected 0.29 NonCanonical+Unexpected 0.08 Decoy 0.01 Unidentified 0.26 Name: NonCanonical+Unmodified/Expected, dtype: float64
In [16]:
# Canon --> Any
tmp = data3.iloc[0,:]
print(tmp.sum().sum())
round(tmp / tmp.sum(),3)
198042
Out[16]:
sankey_opensearch Canonical+Unmodified/Expected 0.817 Canonical+Unexpected 0.089 NonCanonical+Unmodified/Expected 0.003 NonCanonical+Unexpected 0.007 Decoy 0.004 Unidentified 0.081 Name: Canonical+Unmodified/Expected, dtype: float64
Compare by-correlation of spectra identified in both searches (closed & open)¶
In [17]:
# Spectra identified in BOTH searches: per-dataset inner-join tables
# (closed search x open search), concatenated across the three PXD datasets.
combo = []
for dataset_name in ["PXD014258.v0.11.4", "PXD002057.v0.11.4", "PXD005833.v0.11.4"]:
    tmp = pd.read_csv(
        os.path.join(working_folder, dataset_name,
                     f'{dataset_name[:9]}-opensearch-x-closedsearch-filt-{filtering}.csv.gz'))
    print(tmp.shape)  # per-dataset (rows, cols)
    combo.append(tmp)
    del tmp  # drop the per-dataset reference before the next read
combo = pd.concat(combo, ignore_index=True)
print(combo.shape)  # combined (rows, cols)
combo.tail()
(83173, 61) (19200, 61) (82285, 61) (184658, 61)
Out[17]:
| spectrum_title | scan | spectrum_file | precursor_mass_closedsearch | database_peptide_closedsearch | matched_peptide_closedsearch | modifications_closedsearch | database_closedsearch | psm_score_closedsearch | global_q_closedsearch | ... | by-explained_opensearch | b-explained_opensearch | y-explained_opensearch | all-explained_opensearch | by-intensity-pattern-correlation_opensearch | top_tag_rank_nterm_opensearch | top_tag_rank_cterm_opensearch | top_tag_rank_opensearch | predicted_retention_time_opensearch | retention_time_error_adjusted_opensearch | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 184653 | AM9:controllerType=0 controllerNumber=1 scan=1... | 14152 | AM9 | 1936.986225 | ETFLRELISNSSDALDK | ETFLRELISNSSDALDK | Unmodified | T | 0.237943 | 0.003281 | ... | 0.1387 | 673 | 714 | 0.1582 | 0.3885 | 101 | 0 | 0 | 2154.488398 | 337.301222 |
| 184654 | AM9:controllerType=0 controllerNumber=1 scan=5242 | 5242 | AM9 | 984.536291 | VASSQQLPR | VASSQQLPR | Unmodified | T | 0.196643 | 0.003818 | ... | 0.1911 | 232 | 1679 | 0.2505 | 0.9060 | 101 | 71 | 71 | 541.705442 | 785.104618 |
| 184655 | AM9:controllerType=0 controllerNumber=1 scan=2... | 22169 | AM9 | 2372.145727 | KDLYTNTVLSGGTTMYPGIADR | KDLYTNTVLSGGTTMYPGIADR | Unmodified | T | 0.181762 | 0.004363 | ... | 0.3967 | 897 | 3070 | 0.4156 | 0.7955 | 54 | 2 | 2 | 3826.140377 | 51.898757 |
| 184656 | AM9:controllerType=0 controllerNumber=1 scan=1... | 18614 | AM9 | 2085.085425 | QPGPHIIYRCPSALQHIR | QPGPHIIYRCPSALQHIR | Unmodified | T | 0.028571 | 0.008921 | ... | 0.1203 | 444 | 759 | 0.1299 | 0.8856 | 101 | 0 | 0 | 3141.328678 | 94.136818 |
| 184657 | AM9:controllerType=0 controllerNumber=1 scan=1... | 17788 | AM9 | 1684.761083 | SGFATNAASAGGYCRPR | SGFATNAASAGGYCRPR | Unmodified | D | 0.001249 | 0.009685 | ... | 0.2333 | 150 | 2183 | 0.2448 | 0.9301 | 25 | 9 | 9 | 3366.782703 | 421.956723 |
5 rows × 61 columns
In [18]:
# Flags comparing the closed- vs. open-search assignment for each spectrum.
combo["Same_peptide"] = combo.matched_peptide_closedsearch == combo.matched_peptide_opensearch
combo["Same_mod_peptide"] = combo.modified_peptide_closedsearch == combo.modified_peptide_opensearch
# Peptide + modification string (ignoring RT info) must match exactly.
combo["Same_mods_noRT"] = (combo.matched_peptide_closedsearch + combo.modifications_noRT_closedsearch) \
    == (combo.matched_peptide_opensearch + combo.modifications_noRT_opensearch)

def samepep_and_samemod(row):
    """Classify a PSM pair as Same_peptidoform, Positional_isoform or Different_peptide.

    NOTE(review): the combination (different peptide, same mods string) falls
    through and returns None silently; it should be unreachable because
    Same_mods_noRT compares peptide+mods concatenations — confirm.
    """
    if row.Same_peptide and row.Same_mods_noRT:
        return 'Same_peptidoform'
    elif row.Same_peptide and not row.Same_mods_noRT:
        return 'Positional_isoform'
    elif not row.Same_peptide and not row.Same_mods_noRT:
        return 'Different_peptide'

combo['samepep_samemod'] = combo.apply(samepep_and_samemod, axis=1)
# print(combo['samepep_samemod'].value_counts())

VAR, VAL = 'samepep_samemod', 'by-intensity-pattern-correlation'
# Per-class paired comparison of the correlation metric between searches.
for group_name, df in combo.groupby('samepep_samemod'):  # plain iteration, no __iter__()
    _, pval = stats.mannwhitneyu(df[f'{VAL}_closedsearch'],
                                 df[f'{VAL}_opensearch'])
    effect_size = cohenD(df[f'{VAL}_closedsearch'], df[f'{VAL}_opensearch'])
    print(f'{group_name}: p={pval:.1e} {pval<.05} (n={len(df)}) effect size = {effect_size:.3f}')

ORDER = ['Same_peptidoform', 'Positional_isoform', 'Different_peptide']
tmp = combo.melt(id_vars=['spectrum_title', 'scan', 'spectrum_file', 'samepep_samemod'],
                 value_vars=[f'{VAL}_closedsearch', f'{VAL}_opensearch'],
                 var_name='search', value_name='value')
sns.catplot(data=tmp, kind='box', x='samepep_samemod', y='value', hue='search',
            aspect=1.25, showfliers=False, notch=True, palette='Set2', order=ORDER)
plt.title('Intensity correlation across searches')
plt.ylabel('by-intensity-pattern-correlation')
plt.xlabel(None)
plt.savefig(f"publication-data/by-ions-closedsearch-opensearch-1-{filtering}.svg")
Different_peptide: p=2.3e-124 True (n=15581) effect size = 0.228 Positional_isoform: p=4.7e-02 True (n=14118) effect size = 0.032 Same_peptidoform: p=9.4e-01 False (n=154959) effect size = 0.000
In [19]:
# Target/Decoy transition label, e.g. 'T->D' (closed search -> open search).
# Vectorized string concatenation replaces the row-wise apply; astype(str)
# mirrors the f-string's str() conversion for any non-string values.
combo['target_decoy'] = (combo.database_closedsearch.astype(str) + '->'
                         + combo.database_opensearch.astype(str))
# combo.target_decoy.value_counts()  # (was a dead mid-cell expression; repr never shown)
VAR, VAL = 'target_decoy', 'by-intensity-pattern-correlation'
# Per-transition paired comparison of the correlation metric between searches.
for label, df in combo.groupby(VAR):  # plain iteration, no __iter__()
    _, pval = stats.mannwhitneyu(df[f'{VAL}_closedsearch'],
                                 df[f'{VAL}_opensearch'])
    effect_size = cohenD(df[f'{VAL}_closedsearch'], df[f'{VAL}_opensearch'])
    print(f'{label}: p={pval:.1e} {pval<.05} (n={len(df)}) effect size = {effect_size:.3f}')
tmp = combo.melt(id_vars=['spectrum_title', 'scan', 'spectrum_file', 'target_decoy'],
                 value_vars=[f'{VAL}_closedsearch', f'{VAL}_opensearch'],
                 var_name='search', value_name='value')
sns.catplot(data=tmp, kind='box', x='target_decoy', y='value', hue='search',
            aspect=1.25, showfliers=False, notch=True, palette='Set2',
            order=['T->T', 'D->D', 'D->T', 'T->D'])
plt.title('Intensity correlation across searches')
plt.xlabel('Target or Decoy hit (closed->open search)')
plt.ylabel('by-intensity-pattern-correlation')
plt.savefig(f"publication-data/by-ions-closedsearch-opensearch-2-{filtering}.svg")
D->D: p=2.6e-02 True (n=179) effect size = 0.200 D->T: p=5.5e-43 True (n=808) effect size = 0.661 T->D: p=2.7e-01 False (n=745) effect size = 0.070 T->T: p=1.6e-09 True (n=182926) effect size = 0.019
In [20]:
# Canonical/NonCanonical transition label (closed search -> open search).
# Vectorized string concatenation replaces the row-wise apply; astype(str)
# mirrors the f-string's str() conversion for any non-string values.
combo['canon_before_after'] = (combo.isCanonical_closedsearch.astype(str) + '->'
                               + combo.isCanonical_opensearch.astype(str))
# print(combo.canon_before_after.value_counts(sort=False))
ORDER = [
    'Canonical->Canonical',
    'NonCanonical->NonCanonical',
    'Canonical->NonCanonical',
    'NonCanonical->Canonical',
]
VAR, VAL = 'canon_before_after', 'by-intensity-pattern-correlation'
# Per-transition paired comparison of the correlation metric between searches.
for label, df in combo.groupby(VAR):  # plain iteration, no __iter__()
    _, pval = stats.mannwhitneyu(df[f'{VAL}_closedsearch'],
                                 df[f'{VAL}_opensearch'])
    effect_size = cohenD(df[f'{VAL}_closedsearch'], df[f'{VAL}_opensearch'])
    print(f'{label}: p={pval:.1e} {pval<.05} (n={len(df)}) effect size = {effect_size:.3f}')
tmp = combo.melt(id_vars=['spectrum_title', 'scan', 'spectrum_file', VAR],
                 value_vars=[f'{VAL}_closedsearch', f'{VAL}_opensearch'],
                 var_name='search', value_name='value')
sns.catplot(data=tmp, kind='box', x=VAR, y='value', hue='search', showfliers=False,
            notch=True, palette='Set2', order=ORDER
            # aspect=2
            )
plt.xlabel('Canonical protein hit (closed->open search)')
plt.xticks(rotation=90)
plt.ylabel(VAL)
plt.savefig(f"publication-data/by-ions-closedsearch-opensearch-3-{filtering}.svg")
Canonical->Canonical: p=2.6e-07 True (n=180326) effect size = 0.015 Canonical->NonCanonical: p=2.0e-03 True (n=2257) effect size = 0.066 NonCanonical->Canonical: p=3.7e-86 True (n=1137) effect size = 0.855 NonCanonical->NonCanonical: p=1.1e-01 False (n=938) effect size = 0.103
Save &amp; export¶
In [ ]:
# Auto save & export notebook to html!!
# (autosave comes from utility_functions — presumably writes the HTML export
# with the given label suffix; TODO confirm)
autosave(extra_labels='-'+filtering)
# Display which filtering mode this run used ('global' or 'hybrid').
filtering